Data

Data on universities.

Load the data:

uni_data <- read.csv2("I.csv")

Let us show that the data are heterogeneous by looking, for example, at the scatterplot of IN_STATE vs INSTRUCT.

plot(uni_data$IN_STATE, uni_data$INSTRUCT,
     main = "Grouped by PPIND", xlab = "IN_STATE", ylab = "INSTRUCT",
     bg = c("red", "blue")[uni_data$PPIND],
     pch = 21 + as.numeric(uni_data$PPIND))
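
A legend makes the grouping explicit; a sketch, assuming PPIND takes the values 1 and 2 (with 1 = public, as used below):

# assumes PPIND in {1, 2}, 1 = public
legend("topright", legend = c("public (PPIND = 1)", "private (PPIND = 2)"),
       pt.bg = c("red", "blue"), pch = c(22, 23))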

From here on we consider only the public universities (PPIND == 1).

There are a great many features. It may make sense to split the quantitative features into groups and pick a few representatives from each.

For example, split them like this:

## $SAT_ACT_results
##  [1] "AVRMATH" "AVRVERB" "AVRCOMB" "AVR_ACT" "MATH_1"  "MATH_3"  "VERB_1"  "VERB_3"  "ACT_1"   "ACT_3"  
## 
## $Costs
## [1] "R_B_COST" "ROOM"     "BOARD"    "ADD_FEE"  "BOOK"     "PERSONAL"
## 
## $Benefits
## [1] "IN_STATE" "OUT_STAT" "INSTRUCT"
## 
## $Application
## [1] "APP_REC"  "APP_ACC"  "NEW_STUD" "NEW10"    "NEW25"   
## 
## $Quality
## [1] "SF_RATIO" "PH_D"     "TERM_D"   "DONATE"  
## 
## $Students
## [1] "FULLTIME" "PARTTIME" "GRADUAT" 
## 
## $Salaries
## [1] "SAL_FULL" "SAL_AC"   "SAL_AS"   "SAL_ALL" 
## 
## $NumbersOfPeople
## [1] "NUM_FULL" "NUM_AC"   "NUM_AS"   "NUM_ALL" 
## 
## $Comp
## [1] "COMP_FUL" "COMP_AC"  "COMP_AS"  "COMP_ALL"

Let us build the matrix plots.

library(psych)   # pairs.panels

uni_data_public <- uni_data[uni_data$PPIND == 1, ]
groupsNames <- names(types)
mtp <- lapply(groupsNames, function(group){
  pairs.panels(uni_data_public[, types[[group]]],
               lm = TRUE,
               ellipses = FALSE,
               main = group)
})

Take logarithms of everything except SF_RATIO, GRADUAT, PH_D and TERM_D and try again. In addition, report the number of missing values in each feature.

uni_data_log <- uni_data_public
# columns left untransformed: the identifier columns (1-5) plus SF_RATIO, GRADUAT, PH_D, TERM_D
log_idx <- -c(1:5, 31:33, 36)
uni_data_log[, log_idx] <- apply(uni_data_log[, log_idx], 2, log)
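
Equivalently, the same transformation can be expressed by feature name rather than by column position (a sketch, run instead of the line above; it assumes columns 1-5 are the non-quantitative identifier columns):

# sketch: name-based version of the positional indexing above
keep_raw <- c("SF_RATIO", "GRADUAT", "PH_D", "TERM_D")    # left on the original scale
log_cols <- setdiff(unlist(types), keep_raw)              # every other quantitative feature
uni_data_log[log_cols] <- lapply(uni_data_log[log_cols], log)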

mtp <- lapply(types, function(cols){
  pairs.panels(uni_data_log[, cols],
               lm = TRUE,
               ellipses = FALSE)
  # number of missing values in each column of the group
  lst <- lapply(uni_data_log[, cols], function(col) sum(is.na(col)))
  print(lst)
})

## $AVRMATH
## [1] 43
## 
## $AVRVERB
## [1] 43
## 
## $AVRCOMB
## [1] 43
## 
## $AVR_ACT
## [1] 49
## 
## $MATH_1
## [1] 32
## 
## $MATH_3
## [1] 32
## 
## $VERB_1
## [1] 32
## 
## $VERB_3
## [1] 32
## 
## $ACT_1
## [1] 45
## 
## $ACT_3
## [1] 45

## $R_B_COST
## [1] 0
## 
## $ROOM
## [1] 34
## 
## $BOARD
## [1] 54
## 
## $ADD_FEE
## [1] 33
## 
## $BOOK
## [1] 2
## 
## $PERSONAL
## [1] 10

## $IN_STATE
## [1] 7
## 
## $OUT_STAT
## [1] 1
## 
## $INSTRUCT
## [1] 0

## $APP_REC
## [1] 1
## 
## $APP_ACC
## [1] 1
## 
## $NEW_STUD
## [1] 0
## 
## $NEW10
## [1] 15
## 
## $NEW25
## [1] 22

## $SF_RATIO
## [1] 0
## 
## $PH_D
## [1] 5
## 
## $TERM_D
## [1] 11
## 
## $DONATE
## [1] 12

## $FULLTIME
## [1] 0
## 
## $PARTTIME
## [1] 0
## 
## $GRADUAT
## [1] 4

## $SAL_FULL
## [1] 0
## 
## $SAL_AC
## [1] 0
## 
## $SAL_AS
## [1] 0
## 
## $SAL_ALL
## [1] 0

## $NUM_FULL
## [1] 0
## 
## $NUM_AC
## [1] 0
## 
## $NUM_AS
## [1] 0
## 
## $NUM_ALL
## [1] 0

## $COMP_FUL
## [1] 0
## 
## $COMP_AC
## [1] 0
## 
## $COMP_AS
## [1] 0
## 
## $COMP_ALL
## [1] 0

As the primary features, choose from each group those that correlate most strongly with the rest of the group and contain the fewest missing values. Select the primary features and build a matrix plot for them.
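
One way to make this choice concrete (a sketch; rank_in_group is a hypothetical helper, not part of the original code): for each feature in a group, compute its mean absolute correlation with the other members and its number of missing values.

# hypothetical helper: rank features within a group
rank_in_group <- function(df, cols) {
  cc <- abs(cor(df[, cols], use = "pairwise.complete.obs"))
  diag(cc) <- NA
  data.frame(mean_abs_cor = rowMeans(cc, na.rm = TRUE),    # association with the rest of the group
             n_missing    = colSums(is.na(df[, cols])))    # missing values per feature
}
rank_in_group(uni_data_log, types$Quality)   # e.g. candidates from the Quality group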

types$Primary = c("NEW10", "R_B_COST","ADD_FEE", 
                  "BOOK", "SAL_ALL", "NUM_ALL" ,"APP_ACC", "PH_D")

uni_data_log <- na.omit(uni_data_log[, types$Primary])
uni_data_log <- as.data.frame(scale(uni_data_log))

mtp <- pairs.panels(uni_data_log[,types$Primary], 
                    lm=TRUE,
                    ellipses = FALSE)

Shapiro-Wilk test before the log transform:

print(lapply(uni_data_public[,types$Primary], function(x) { shapiro.test(x)$p.value}))
## $NEW10
## [1] 9.678601e-11
## 
## $R_B_COST
## [1] 0.0008010331
## 
## $ADD_FEE
## [1] 1.457477e-14
## 
## $BOOK
## [1] 9.108105e-05
## 
## $SAL_ALL
## [1] 0.3888914
## 
## $NUM_ALL
## [1] 3.875619e-05
## 
## $APP_ACC
## [1] 5.484138e-08
## 
## $PH_D
## [1] 0.1242419

Shapiro-Wilk test after the log transform:

print(lapply(uni_data_log[,types$Primary], function(x) { shapiro.test(x)$p.value}))
## $NEW10
## [1] 0.03108544
## 
## $R_B_COST
## [1] 0.7211772
## 
## $ADD_FEE
## [1] 0.01164912
## 
## $BOOK
## [1] 0.1592939
## 
## $SAL_ALL
## [1] 0.5915829
## 
## $NUM_ALL
## [1] 0.01269193
## 
## $APP_ACC
## [1] 0.01923453
## 
## $PH_D
## [1] 0.1157398

The features have moved closer to normality.

Regression

Let us look at a regression of NEW10 on the remaining primary features.

l <- lm(NEW10 ~ R_B_COST + ADD_FEE + BOOK
        + SAL_ALL + NUM_ALL + APP_ACC + PH_D, uni_data_log)
summary(l)
## 
## Call:
## lm(formula = NEW10 ~ R_B_COST + ADD_FEE + BOOK + SAL_ALL + NUM_ALL + 
##     APP_ACC + PH_D, data = uni_data_log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.15731 -0.50506  0.03888  0.48687  1.70733 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -9.194e-16  9.578e-02   0.000   1.0000  
## R_B_COST     7.819e-02  1.236e-01   0.632   0.5292  
## ADD_FEE      1.825e-01  1.104e-01   1.653   0.1029  
## BOOK         1.795e-01  1.020e-01   1.761   0.0827 .
## SAL_ALL      2.269e-01  1.397e-01   1.624   0.1090  
## NUM_ALL      8.922e-02  1.717e-01   0.519   0.6051  
## APP_ACC     -8.486e-02  1.731e-01  -0.490   0.6255  
## PH_D         2.822e-01  1.200e-01   2.353   0.0215 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## s: 0.8405 on 69 degrees of freedom
## Multiple R-squared: 0.3587,
## Adjusted R-squared: 0.2936 
## F-statistic: 5.512 on 7 and 69 DF,  p-value: 4.602e-05
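
The significance claim below refers to the overall F-test; its p-value can also be extracted from the summary object directly:

fs <- summary(l)$fstatistic
pf(fs[1], fs[2], fs[3], lower.tail = FALSE)   # overall p-value of the F-test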

The regression is significant.

Redundancy analysis

library(Hmisc)   # redun
library(ppcor)   # pcor, spcor

red <- redun(~ R_B_COST + ADD_FEE + BOOK
             + SAL_ALL + NUM_ALL + APP_ACC + PH_D,
             uni_data_log, type = "ordinary", r2 = 0.0000001)$rsq1

pcors <- pcor(na.omit(uni_data_log[types$Primary]))$estimate[1, -1]
semi.pcors <- spcor(na.omit(uni_data_log[types$Primary]))$estimate[1, -1]

redundancy <- data.frame(cbind(red, 1 - red, pcors,semi.pcors), 
                         row.names = types$Primary[-1])

colnames(redundancy) <- c("Multiple R^2", "Tolerance", "Partial cor", "Semi-partial cor")
print(redundancy)
##          Multiple R^2 Tolerance Partial cor Semi-partial cor
## R_B_COST    0.4416787 0.5583213  0.07591831       0.06097414
## ADD_FEE     0.3131245 0.6868755  0.19517176       0.15936545
## BOOK        0.1704369 0.8295631  0.20734853       0.16974127
## SAL_ALL     0.6340405 0.3659595  0.19186325       0.15655974
## NUM_ALL     0.7408881 0.2591119  0.06241516       0.05008199
## APP_ACC     0.7496509 0.2503491 -0.05891074      -0.04725996
## PH_D        0.4539229 0.5460771  0.27250154       0.22681288
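
As a related cross-check, the variance inflation factors of the linear model can be compared with the reciprocals of the tolerances above; assuming the car package is available:

library(car)   # assumption: car is installed
vif(l)         # compare with 1 / redundancy$Tolerance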

Remove APP_ACC, the feature with the largest \(R^2\) (smallest tolerance) and the smallest partial correlation in absolute value, and see what changes.

types$Primary <- setdiff(types$Primary, "APP_ACC")
l <- lm(NEW10 ~ R_B_COST + ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D, uni_data_log)
summary(l)
## 
## Call:
## lm(formula = NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL + 
##     PH_D, data = uni_data_log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12378 -0.51621  0.05527  0.49049  1.69568 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -8.483e-16  9.526e-02   0.000   1.0000  
## R_B_COST     5.789e-02  1.159e-01   0.500   0.6189  
## ADD_FEE      1.639e-01  1.031e-01   1.589   0.1165  
## BOOK         1.764e-01  1.012e-01   1.743   0.0857 .
## SAL_ALL      2.203e-01  1.383e-01   1.593   0.1157  
## NUM_ALL      2.641e-02  1.137e-01   0.232   0.8171  
## PH_D         2.901e-01  1.182e-01   2.454   0.0166 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## s: 0.8359 on 70 degrees of freedom
## Multiple R-squared: 0.3564,
## Adjusted R-squared: 0.3013 
## F-statistic: 6.461 on 6 and 70 DF,  p-value: 1.87e-05

\(R^2\) barely changed (0.3587 before, 0.3564 after), while the overall significance of the regression improved (p-value 4.602e-05 vs 1.87e-05).

Stepwise regression

Run forward selection and then backward elimination on this model.

# forward() and backward() are stepwise-selection helpers (presumably from the mixlm package)
forward(l, 0.99999)
## Forward selection, alpha-to-enter: 0.99999
## 
## Full model: NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D
## <environment: 0x7fd596716d80>
## 
##          Step    RSS     AIC  R2pred     Cp F value    Pr(>F)    
## PH_D        1 56.773 -19.466 0.21262 8.2505 25.4004 3.132e-06 ***
## SAL_ALL     2 52.753 -23.120 0.25147 4.4979  5.6385   0.02016 *  
## BOOK        3 51.114 -23.551 0.26049 4.1515  2.3416   0.13028    
## ADD_FEE     4 49.105 -24.638 0.25973 3.2769  2.9451   0.09044 .  
## R_B_COST    5 48.949 -22.883 0.23350 5.0539  0.2260   0.63599    
## NUM_ALL     6 48.912 -20.942 0.21875 7.0000  0.0539   0.81709    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:
## lm(formula = NEW10 ~ PH_D + SAL_ALL + BOOK + ADD_FEE + R_B_COST + 
##     NUM_ALL, data = uni_data_log)
## 
## Coefficients:
## (Intercept)         PH_D      SAL_ALL         BOOK      ADD_FEE     R_B_COST      NUM_ALL  
##  -8.483e-16    2.901e-01    2.203e-01    1.764e-01    1.639e-01    5.789e-02    2.641e-02

With forward selection the AIC minimum is reached after four features have been added. Note, though, that once SAL_ALL has entered the model the differences in AIC are small and may well be due to chance.

backward(l,1e-9)
## Backward elimination, alpha-to-remove: 1e-09
## 
## Full model: NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D
## <environment: 0x7fd599050340>
## 
##          Step    RSS       AIC    R2pred      Cp F value    Pr(>F)    
## NUM_ALL     1 48.949 -22.88273  0.233497  5.0539  0.0539   0.81709    
## R_B_COST    2 49.105 -24.63805  0.259734  3.2769  0.2260   0.63599    
## ADD_FEE     3 51.114 -23.55113  0.260491  4.1515  2.9451   0.09044 .  
## BOOK        4 52.753 -23.12003  0.251473  4.4979  2.3416   0.13028    
## SAL_ALL     5 56.773 -19.46577  0.212623  8.2505  5.6385   0.02016 *  
## PH_D        6 76.000   0.99345 -0.026489 33.7678 25.4004 3.132e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:
## lm(formula = NEW10 ~ 1, data = uni_data_log)
## 
## Coefficients:
## (Intercept)  
##   3.416e-16

In backward elimination the AIC minimum is reached at the second step, i.e. again with four predictors remaining, and once more the differences between the criterion values are very small.
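
As a sanity check, base R's step(), which minimizes AIC directly, can search over the same six predictors in both directions; its selection can then be compared with the tables above.

step(l, direction = "both", trace = 0)   # AIC-driven stepwise search (base R)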

Keep only the features PH_D and SAL_ALL.

types$Primary <- c("NEW10", "PH_D", "SAL_ALL")

Outliers

Outliers by Mahalanobis distance

X <- uni_data_log[types$Primary[-1]]   # PH_D and SAL_ALL
m <- sort(mahalanobis(X, colMeans(X), cov(X)))
plot(m, main = "Mahalanobis distance", xlab = "Individual", ylab = "distance",
     ylim = c(0, max(m) + 1))
text(m, labels = names(m), cex = 0.7, pos = 3)
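
A common reference cutoff, assuming the two features are roughly bivariate normal, is a chi-squared quantile with degrees of freedom equal to the number of variables:

cutoff <- qchisq(0.975, df = length(types$Primary) - 1)   # df = 2 here
names(m[m > cutoff])                                      # candidate outliers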

out <- c("112")

uni_data_log <- uni_data_log[!rownames(uni_data_log) %in% out,]
uni_data[out,c("X", types$Primary[-1])]
##                        X PH_D SAL_ALL
## 112 University of North    97     379

The outlier is a public university with the highest percentage of PhDs.

Outliers by Cook's distance.

cook <- sort(cooks.distance(l))
plot( cook, main = "Cook's distance", xlab = "Individual", ylab = "distance", ylim = c(0,0.2))
abline(h = 4/(nrow(uni_data_log) - length(types$Primary) + 2), col = "red")
text(cook, labels = names(cook), cex= 0.7, pos=3)
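
The observations above the reference line can also be listed programmatically, using the same cutoff as in the plot:

cutoff <- 4 / (nrow(uni_data_log) - length(types$Primary) + 2)
names(cook[cook > cutoff])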

out <- c("37", "74")
print(uni_data[out,c("X", types$Primary[-1])])
##                       X PH_D SAL_ALL
## 37 Georgia State Univer   87     502
## 74 University of Massac   88     607
uni_data_log <- uni_data_log[!rownames(uni_data_log) %in% out,]

The outliers are public universities with a high percentage of PhDs and, at the same time, high salaries.

Residuals vs deleted residuals

Refit the model on the cleaned data and check the normality of the residuals.

l <- lm(NEW10 ~ PH_D + SAL_ALL, uni_data_log)
summary(l)
## 
## Call:
## lm(formula = NEW10 ~ PH_D + SAL_ALL, data = uni_data_log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81492 -0.45477 -0.02579  0.46737  1.49861 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.06574    0.09076   0.724 0.471256    
## PH_D         0.39358    0.11396   3.454 0.000936 ***
## SAL_ALL      0.27423    0.11782   2.328 0.022791 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## s: 0.7798 on 71 degrees of freedom
## Multiple R-squared: 0.3717,
## Adjusted R-squared: 0.354 
## F-statistic:    21 on 2 and 71 DF,  p-value: 6.846e-08
plot(l, which = 1)

shapiro.test(l$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  l$residuals
## W = 0.98245, p-value = 0.3949

Normal Probability Plot

ef <- ecdf(l$residuals)
# shift by 0.5/n so that the largest residual does not map to qnorm(1) = Inf
plot(l$residuals, qnorm(ef(l$residuals) - 0.5 / length(l$residuals)),
     xlab = "Residuals", ylab = "Sample Quantiles", main = "Normal Probability Plot")
abline(a = 0, b = 1)

Residual analysis

df <- na.omit(uni_data_log[, types$Primary])
LM.full <- lm(NEW10 ~ PH_D + SAL_ALL, df)                  # fitted once on all observations
resid.list <- sapply(1:nrow(df), function(i){
  x.i <- c(1, as.numeric(df[i, -1]))                       # (intercept, PH_D, SAL_ALL) for observation i
  resid.with.i    <- df[i, "NEW10"] - as.numeric(LM.full$coefficients) %*% x.i
  LM.loo          <- lm(NEW10 ~ PH_D + SAL_ALL, df[-i, ])  # refit without observation i
  resid.without.i <- df[i, "NEW10"] - as.numeric(LM.loo$coefficients) %*% x.i
  c(resid.with.i, resid.without.i)
})

plot(resid.list[1,], resid.list[2,], 
     main = "Residuals vs Deleted Residuals", 
     xlab = "Residuals", 
     ylab = "Deleted Residuals")
abline(a = 0, b =1, col = "red")

It is clear that the deleted residuals exceed the ordinary residuals in absolute value: \(|e_i| \leq |e_{(i)}|\), since \(e_{(i)} = e_i/(1 - h_{ii})\) with \(0 < 1 - h_{ii} < 1\).
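
The same deleted residuals are available in closed form from the hat values, with no per-observation refit; a quick check (assuming the rows of df coincide with the data behind l, as above):

h <- hatvalues(l)                                      # leverages from the final model
deleted <- residuals(l) / (1 - h)                      # closed-form (PRESS) deleted residuals
all.equal(unname(deleted), unname(resid.list[2, ]))    # should be TRUE up to rounding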